import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Image
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import plotly.express as px
import warnings
warnings.filterwarnings("ignore")


# Load the country-level socioeconomic indicators from the CSV file
df = pd.read_csv(filepath_or_buffer='Country-data.csv')


# Preview the first five rows
df.head(n=5)


# Summarize column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167 entries, 0 to 166
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   country     167 non-null    object 
 1   child_mort  167 non-null    float64
 2   exports     167 non-null    float64
 3   health      167 non-null    float64
 4   imports     167 non-null    float64
 5   income      167 non-null    int64  
 6   inflation   167 non-null    float64
 7   life_expec  167 non-null    float64
 8   total_fer   167 non-null    float64
 9   gdpp        167 non-null    int64  
dtypes: float64(7), int64(2), object(1)
memory usage: 13.2+ KB


# Summary statistics for every numeric indicator
df.describe()


# Draw one world map per indicator. The maps differ only in the column used
# for colouring, so they are produced in a loop instead of repeating the
# same call once per column.
indicator_columns = [
    'gdpp',
    'child_mort',
    'income',
    'exports',
    'imports',
    'health',
    'inflation',
]

for indicator in indicator_columns:
    fig = px.choropleth(df,
                        locationmode='country names',
                        locations='country',
                        color=indicator,
                        title=f'Countries by {indicator}'
                       )
    fig.show()


# Drop the country column so that only the numeric indicators remain
df_without_countries = df.drop(columns=['country'])


# Preview the numeric feature table
df_without_countries.head(n=5)


# Elbow method: fit KMeans for k = 1..5 and record the inertia of each fit.
ks = range(1,6)
inertias = []

# Standardize the features first, exactly as the final pipeline does. On the
# raw values the distances are dominated by the large-magnitude columns
# (income, gdpp), so the elbow would not describe the model that is fitted.
scaled_features = StandardScaler().fit_transform(df_without_countries)

for i in ks:
    # Fixed seed (same as the final model) so the curve is reproducible
    model = KMeans(n_clusters=i, random_state=42)
    model.fit(scaled_features)
    inertias.append(model.inertia_)


# Plot inertia against the number of clusters; the "elbow" is where adding
# another cluster stops reducing the inertia substantially.
plt.plot(ks, inertias, marker='o')
plt.xticks(list(ks))
plt.xlabel('Number of Clusters')
plt.ylabel('Inertias')
plt.show()


# Clustering pipeline: standardize every feature, then run KMeans with
# 3 clusters and a fixed seed.
kmeans = KMeans(random_state=42, n_clusters=3)
scaler = StandardScaler()
pipeline = make_pipeline(scaler, kmeans)


# Fit both steps on the numeric indicators
pipeline.fit(X=df_without_countries)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3, random_state=42))])

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kmeans', KMeans(n_clusters=3, random_state=42))])

StandardScaler()

KMeans(n_clusters=3, random_state=42)


# Run the fitted pipeline on the same table to get one cluster label per row
clusters = pipeline.predict(X=df_without_countries)


# Display the predicted labels
clusters

array([1, 2, 2, 1, 2, 2, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 2, 1, 2, 2, 2, 1,
       2, 0, 2, 1, 1, 2, 1, 0, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 1, 2, 0, 0,
       0, 2, 2, 2, 2, 1, 1, 2, 2, 0, 0, 1, 1, 2, 0, 1, 0, 2, 2, 1, 1, 2,
       1, 2, 0, 2, 2, 2, 1, 0, 0, 0, 2, 0, 2, 2, 1, 1, 0, 2, 1, 2, 2, 1,
       1, 2, 2, 0, 2, 1, 1, 2, 2, 1, 0, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2,
       0, 0, 1, 1, 0, 2, 1, 2, 2, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 1, 2, 2,
       1, 0, 0, 0, 2, 1, 0, 0, 2, 2, 1, 2, 0, 0, 2, 1, 2, 1, 1, 2, 2, 2,
       2, 1, 2, 0, 0, 0, 2, 2, 2, 2, 2, 1, 1])


# Select the country names by column label rather than by position, so the
# lookup does not depend on 'country' being the first column of the CSV.
country = df['country']


# Pair every country with its predicted cluster label
final = pd.DataFrame({'clusters':clusters,'country':country})
final.head()


# Cluster ids are labels, not quantities: convert them to strings so that
# plotly draws a discrete legend (one colour per cluster) instead of a
# continuous colour bar. `final` itself is left unchanged.
cluster_labels = final['clusters'].astype(str)
fig = px.choropleth(final.assign(clusters=cluster_labels),
                    locationmode='country names',
                    locations='country',
                    color='clusters',
                    # Keep the legend in numeric cluster order
                    category_orders={'clusters': sorted(cluster_labels.unique(), key=int)},
                    title='Countries by clusters'
                   )
fig.show()


# Attach the original indicators to each country's cluster label
final_df = final.merge(df, on='country')


# First rows of cluster 1
final_df.loc[final_df['clusters'] == 1].head()


# First rows of cluster 0
final_df.loc[final_df['clusters'] == 0].head()


# First rows of cluster 2
final_df.loc[final_df['clusters'] == 2].head()

	country	child_mort	exports	health	imports	income	inflation	life_expec	total_fer	gdpp
0	Afghanistan	90.2	10.0	7.58	44.9	1610	9.44	56.2	5.82	553
1	Albania	16.6	28.0	6.55	48.6	9930	4.49	76.3	1.65	4090
2	Algeria	27.3	38.4	4.17	31.4	12900	16.10	76.5	2.89	4460
3	Angola	119.0	62.3	2.85	42.9	5900	22.40	60.1	6.16	3530
4	Antigua and Barbuda	10.3	45.5	6.03	58.9	19100	1.44	76.8	2.13	12200

	child_mort	exports	health	imports	income	inflation	life_expec	total_fer	gdpp
count	167.000000	167.000000	167.000000	167.000000	167.000000	167.000000	167.000000	167.000000	167.000000
mean	38.270060	41.108976	6.815689	46.890215	17144.688623	7.781832	70.555689	2.947964	12964.155689
std	40.328931	27.412010	2.746837	24.209589	19278.067698	10.570704	8.893172	1.513848	18328.704809
min	2.600000	0.109000	1.810000	0.065900	609.000000	-4.210000	32.100000	1.150000	231.000000
25%	8.250000	23.800000	4.920000	30.200000	3355.000000	1.810000	65.300000	1.795000	1330.000000
50%	19.300000	35.000000	6.320000	43.300000	9960.000000	5.390000	73.100000	2.410000	4660.000000
75%	62.100000	51.350000	8.600000	58.750000	22800.000000	10.750000	76.800000	3.880000	14050.000000
max	208.000000	200.000000	17.900000	174.000000	125000.000000	104.000000	82.800000	7.490000	105000.000000

	child_mort	exports	health	imports	income	inflation	life_expec	total_fer	gdpp
0	90.2	10.0	7.58	44.9	1610	9.44	56.2	5.82	553
1	16.6	28.0	6.55	48.6	9930	4.49	76.3	1.65	4090
2	27.3	38.4	4.17	31.4	12900	16.10	76.5	2.89	4460
3	119.0	62.3	2.85	42.9	5900	22.40	60.1	6.16	3530
4	10.3	45.5	6.03	58.9	19100	1.44	76.8	2.13	12200

	clusters	country
0	1	Afghanistan
1	2	Albania
2	2	Algeria
3	1	Angola
4	2	Antigua and Barbuda

	clusters	country	child_mort	exports	health	imports	income	inflation	life_expec	total_fer	gdpp
0	1	Afghanistan	90.2	10.0	7.58	44.9	1610	9.440	56.2	5.82	553
3	1	Angola	119.0	62.3	2.85	42.9	5900	22.400	60.1	6.16	3530
17	1	Benin	111.0	23.8	4.10	37.2	1820	0.885	61.8	5.36	758
21	1	Botswana	52.5	43.6	8.30	51.3	13300	8.920	57.1	2.88	6350
25	1	Burkina Faso	116.0	19.2	6.74	29.6	1430	6.810	57.9	5.87	575

Exploring Global Socioeconomic Clusters¶

Objectives¶

Data Source¶

Navigating the Analysis¶

Unveiling Cluster Insights¶

Importing Libraries and Datasets¶

Exploring data¶

GDP per capita by country¶

Deaths of children under 5 years of age per 1,000 live births¶

Net income per person¶

Exports of goods and services per capita¶

Imports of goods and services per capita¶

Total health spending per capita¶

Inflation¶

Applying KMeans¶

Clustering Quality¶

Based on the elbow method, we will use 3 clusters¶

Creating a pipeline¶

Examining the different clusters¶

Insight¶

Cluster 1: Countries in Dire Need of Aid¶

Cluster 0: Developed and Prosperous Countries¶

Cluster 2: Intermediate Position Countries¶

	country	child_mort	exports	health	imports	income	inflation	life_expec	total_fer	gdpp
7	Australia	4.8	19.8	8.73	20.9	41400	1.160	82.0	1.93	51900
8	Austria	4.3	51.3	11.00	47.8	43200	0.873	80.5	1.44	46900
11	Bahrain	8.6	69.5	4.97	50.9	41100	7.440	76.0	2.16	20700
15	Belgium	4.5	76.4	10.70	74.7	41100	1.880	80.0	1.86	44400
23	Brunei	10.5	67.4	2.84	28.0	80600	16.700	77.1	1.84	35300